library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Variable dictionary shipped alongside the data set (one row per feature)
feature_description_original <- readxl::read_excel(
"data/feature_description.xlsx")
feature_description_original
# Load the raw customer data. read_csv2() handles the ';'-delimited,
# comma-decimal format; column types are declared explicitly (presumably to
# keep IDs/postcodes as text and avoid type guessing -- TODO confirm against
# the feature description).
customer_segmentation_raw <- read_csv2(
"data/customer_segmentation_test.csv",
col_types = list(col_character(), col_character(), col_character(), col_character(),
col_double(), col_double(), col_character(), col_double(), col_double(),
col_character(), col_double(), col_double(), col_character(), col_double(),
col_double(), col_character(), col_double(), col_double(), col_character(),
col_character(), col_character()),
guess_max = 400000
) %>% mutate(
# parse day-month-year date strings into proper Date columns
`Date of Birth` = lubridate::dmy(`Date of Birth`),
Gender = as.factor(Gender),
MERCHANDISE2015 = as.factor(MERCHANDISE2015),
MERCHANDISE2016 = as.factor(MERCHANDISE2016),
# NOTE: the 2017/2018 merchandise columns are misspelled ("MERCHANDIESE")
# in the source file; correctly-spelled factor copies are created here...
MERCHANDISE2017 = as.factor(MERCHANDIESE2017),
MERCHANDISE2018 = as.factor(MERCHANDIESE2018),
MERCHANDISE2019 = as.factor(MERCHANDISE2019),
LastPaymentDate = lubridate::dmy(LastPaymentDate),
PenultimatePaymentDate = lubridate::dmy(PenultimatePaymentDate)
# ...and the misspelled originals are dropped
) %>% select(-c(MERCHANDIESE2017, MERCHANDIESE2018)) %>%
# space-free names for easier downstream use
rename(DateOfBirth = `Date of Birth`,
ID =`Customer Number`)
## i Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
skimr::skim(customer_segmentation_raw)
| Name | customer_segmentation_raw |
| Number of rows | 406734 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 3 |
| factor | 6 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ID | 0 | 1.00 | 10 | 10 | 0 | 406734 | 0 |
| Postcode | 9176 | 0.98 | 1 | 9 | 0 | 2982 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| DateOfBirth | 155491 | 0.62 | 1902-04-21 | 2015-03-30 | 1948-03-09 | 25514 |
| LastPaymentDate | 0 | 1.00 | 2015-01-03 | 2020-02-13 | 2018-12-06 | 1361 |
| PenultimatePaymentDate | 44699 | 0.89 | 1995-12-31 | 2020-02-05 | 2017-04-12 | 5376 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1 | FALSE | 3 | fem: 203904, mal: 183467, fam: 19363 |
| MERCHANDISE2015 | 0 | 1 | FALSE | 2 | 0: 401845, 1: 4889 |
| MERCHANDISE2016 | 0 | 1 | FALSE | 2 | 0: 401585, 1: 5149 |
| MERCHANDISE2019 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
| MERCHANDISE2017 | 0 | 1 | FALSE | 2 | 0: 402378, 1: 4356 |
| MERCHANDISE2018 | 0 | 1 | FALSE | 2 | 0: 401470, 1: 5264 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1 | 2.52 | 4.00 | 0 | 0 | 2 | 2 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1 | 42.44 | 850.19 | 0 | 0 | 15 | 45 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1 | 1.22 | 2.02 | 0 | 0 | 1 | 1 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1 | 50.93 | 591.05 | 0 | 0 | 16 | 50 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1 | 1.06 | 1.91 | 0 | 0 | 0 | 1 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1 | 24.78 | 572.90 | 0 | 0 | 0 | 20 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1 | 1.00 | 1.87 | 0 | 0 | 0 | 1 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1 | 20.64 | 1552.60 | 0 | 0 | 0 | 15 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1 | 0.97 | 1.79 | 0 | 0 | 0 | 1 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1 | 46.44 | 3999.80 | 0 | 0 | 0 | 30 | 2400000.0 | ▇▁▁▁▁ |
I am very open to suggestions for improvement here ^^
# Read the Austrian postcode directory and attach location info to each customer.
zip_code_list <- readxl::read_excel("data/PLZ_Verzeichnis-20211201.xls")
zip_code_list
customer_segmentation_with_zip <- customer_segmentation_raw %>%
  # match each customer's postcode against the directory
  left_join(zip_code_list, by = c("Postcode" = "PLZ")) %>%
  # administrative directory columns we have no use for
  select(-c(`gültig ab`, `gültig bis`, NamePLZTyp, intern_extern, adressierbar, Postfach)) %>%
  # keep only rows whose location could be resolved
  drop_na(Postcode, Ort, Bundesland) %>%
  # postcode and federal state are categorical
  mutate(across(c(Postcode, Bundesland), as.factor))
customer_segmentation_with_zip
# here we define, which months should be understood as "christmas months" to define "XMAS_donation"
XMAS_months = c(11,
12,
1)
# this date will be used as the reference for this analysis
reference_date <- lubridate::ymd("2021-12-17")
# Feature engineering on the geocoded customer data.
# Derives demographics (birth year, age, generation), aggregate donation
# statistics (totals, average, active years), payment-timing features
# (months/years, interval, recency) and two donor flags (Christmas donor,
# merchandise buyer).
customer_segmentation_first_prepro <- customer_segmentation_with_zip %>%
  mutate(
    # year of customer's birthday
    year_born = lubridate::year(DateOfBirth),
    # age of donors at their last donation, in whole years
    age_at_last_donation = lubridate::interval(DateOfBirth, LastPaymentDate) %>%
      as.numeric("years") %>%
      as.integer(),
    # generation label by birth year; anyone born after 2012 (or with a
    # missing birth date) ends up as NA
    generation_moniker = case_when(
      year_born <= 1945 ~ "silent",
      year_born <= 1964 ~ "boomer",
      year_born <= 1980 ~ "x",
      year_born <= 1996 ~ "millennial",
      year_born <= 2012 ~ "z"
    ) %>% as_factor(),
    # total number of donations over all years
    COUNTtotal = COUNT2015 + COUNT2016 + COUNT2017 + COUNT2018 + COUNT2019,
    # total donation amount over all years
    SUMtotal = SUM2015 + SUM2016 + SUM2017 + SUM2018 + SUM2019,
    # average donation amount (NaN for customers with COUNTtotal == 0)
    SUMaverage = SUMtotal / COUNTtotal,
    # month of the last payment
    LastPaymentMONTH = lubridate::month(LastPaymentDate) %>% as.factor(),
    # month of second to last payment
    PenultimatePaymentMONTH = lubridate::month(PenultimatePaymentDate) %>% as.factor(),
    # year of the last payment
    LastPaymentYEAR = lubridate::year(LastPaymentDate),
    # year of second to last payment
    PenultimatePaymentYEAR = lubridate::year(PenultimatePaymentDate),
    # TODO(review): status as christmas donor if the last two payments were
    # around christmas; the Nov-Jan window may be too large, and the "maybe"
    # status (only the last payment in the window, e.g. single-payment
    # customers) is shady at best. Note the month factors match the numeric
    # XMAS_months via %in%'s character coercion.
    XMAS_donor = as_factor(case_when(
      LastPaymentMONTH %in% XMAS_months & PenultimatePaymentMONTH %in% XMAS_months ~ "yes",
      LastPaymentMONTH %in% XMAS_months ~ "maybe",
      TRUE ~ "unlikely"
    )),
    # days between last and second to last payment
    # (FIX: direct difftime conversion instead of the roundabout
    # lubridate::day(lubridate::days(...)) construction)
    donation_interval = as.integer(LastPaymentDate - PenultimatePaymentDate),
    # days since the last payment, measured against the reference date
    # (FIX: was LastPaymentDate - reference_date, which produced all-negative
    # "days since" values)
    days_since_last_payment = as.integer(reference_date - LastPaymentDate),
    # binary factor: was any merchandise bought over the observation period?
    merchandise_any = as_factor(if_else(
      !is.na(MERCHANDISE2015) & MERCHANDISE2015 != 0 |
      !is.na(MERCHANDISE2016) & MERCHANDISE2016 != 0 |
      !is.na(MERCHANDISE2017) & MERCHANDISE2017 != 0 |
      !is.na(MERCHANDISE2018) & MERCHANDISE2018 != 0 |
      !is.na(MERCHANDISE2019) & MERCHANDISE2019 != 0,
      1,
      0)),
    # number of years in which anything was donated (0-5), computed row-wise
    # (FIX: replaces the previous group_by(ID)/sum/ungroup workaround, which
    # built ~400k single-row groups just to get a per-row sum)
    num_of_donation_years = rowSums(across(num_range("COUNT", 2015:2019)) > 0,
                                    na.rm = TRUE)
  ) %>%
  # remove identifying / already-consumed columns
  select(-c(ID, DateOfBirth, LastPaymentDate, PenultimatePaymentDate))
customer_segmentation_first_prepro
# Overview of the engineered feature set
customer_segmentation_first_prepro %>% skimr::skim()
| Name | Piped data |
| Number of rows | 396694 |
| Number of columns | 34 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| factor | 13 |
| numeric | 20 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Ort | 0 | 1 | 2 | 40 | 0 | 2178 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Gender | 0 | 1.00 | FALSE | 3 | fem: 199545, mal: 179215, fam: 17934 |
| Postcode | 0 | 1.00 | FALSE | 2249 | 122: 6776, 121: 6208, 110: 5941, 502: 5383 |
| MERCHANDISE2015 | 0 | 1.00 | FALSE | 2 | 0: 391818, 1: 4876 |
| MERCHANDISE2016 | 0 | 1.00 | FALSE | 2 | 0: 391552, 1: 5142 |
| MERCHANDISE2019 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| MERCHANDISE2017 | 0 | 1.00 | FALSE | 2 | 0: 392339, 1: 4355 |
| MERCHANDISE2018 | 0 | 1.00 | FALSE | 2 | 0: 391460, 1: 5234 |
| Bundesland | 0 | 1.00 | FALSE | 9 | N: 88175, W: 70706, O: 66082, St: 57348 |
| generation_moniker | 146208 | 0.63 | FALSE | 5 | sil: 110508, boo: 102068, x: 33020, mil: 4734 |
| LastPaymentMONTH | 0 | 1.00 | FALSE | 12 | 12: 119035, 11: 66379, 1: 45775, 10: 42275 |
| PenultimatePaymentMONTH | 37875 | 0.90 | FALSE | 12 | 12: 91203, 11: 56900, 10: 42674, 1: 27463 |
| XMAS_donor | 0 | 1.00 | FALSE | 3 | unl: 165505, may: 119746, yes: 111443 |
| merchandise_any | 0 | 1.00 | FALSE | 2 | 0: 377620, 1: 19074 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| COUNT2015 | 0 | 1.00 | 2.56 | 4.03 | 0.00 | 0.00 | 2.00 | 4.00 | 96.0 | ▇▁▁▁▁ |
| SUM2015 | 0 | 1.00 | 41.12 | 724.36 | 0.00 | 0.00 | 15.00 | 45.00 | 388113.6 | ▇▁▁▁▁ |
| COUNT2016 | 0 | 1.00 | 1.24 | 2.03 | 0.00 | 0.00 | 1.00 | 1.00 | 178.0 | ▇▁▁▁▁ |
| SUM2016 | 0 | 1.00 | 51.20 | 596.95 | 0.00 | 0.00 | 20.00 | 50.00 | 295599.8 | ▇▁▁▁▁ |
| COUNT2017 | 0 | 1.00 | 1.08 | 1.92 | 0.00 | 0.00 | 0.00 | 1.00 | 95.0 | ▇▁▁▁▁ |
| SUM2017 | 0 | 1.00 | 24.45 | 484.85 | 0.00 | 0.00 | 0.00 | 20.00 | 207134.7 | ▇▁▁▁▁ |
| COUNT2018 | 0 | 1.00 | 1.02 | 1.88 | 0.00 | 0.00 | 0.00 | 1.00 | 49.0 | ▇▁▁▁▁ |
| SUM2018 | 0 | 1.00 | 20.76 | 1570.91 | 0.00 | 0.00 | 0.00 | 15.00 | 911146.5 | ▇▁▁▁▁ |
| COUNT2019 | 0 | 1.00 | 0.98 | 1.80 | 0.00 | 0.00 | 0.00 | 1.00 | 31.0 | ▇▁▁▁▁ |
| SUM2019 | 0 | 1.00 | 46.90 | 4049.95 | 0.00 | 0.00 | 0.00 | 30.00 | 2400000.0 | ▇▁▁▁▁ |
| year_born | 146204 | 0.63 | 1949.25 | 14.01 | 1902.00 | 1939.00 | 1948.00 | 1959.00 | 2015.0 | ▁▇▇▂▁ |
| age_at_last_donation | 146204 | 0.63 | 68.33 | 14.00 | 0.00 | 59.00 | 70.00 | 79.00 | 117.0 | ▁▁▇▇▁ |
| COUNTtotal | 0 | 1.00 | 6.87 | 9.93 | 1.00 | 2.00 | 3.00 | 7.00 | 273.0 | ▇▁▁▁▁ |
| SUMtotal | 0 | 1.00 | 184.43 | 4898.70 | 0.01 | 30.00 | 65.00 | 160.00 | 2400225.0 | ▇▁▁▁▁ |
| SUMaverage | 0 | 1.00 | 36.08 | 1530.61 | 0.01 | 11.25 | 17.34 | 29.42 | 750000.0 | ▇▁▁▁▁ |
| LastPaymentYEAR | 0 | 1.00 | 2017.78 | 1.53 | 2015.00 | 2016.00 | 2018.00 | 2019.00 | 2020.0 | ▅▂▃▇▂ |
| PenultimatePaymentYEAR | 37875 | 0.90 | 2015.72 | 3.91 | 1995.00 | 2015.00 | 2017.00 | 2018.00 | 2020.0 | ▁▁▁▃▇ |
| donation_interval | 37875 | 0.90 | 773.66 | 1215.88 | 1.00 | 123.00 | 354.00 | 762.00 | 8766.0 | ▇▁▁▁▁ |
| days_since_last_payment | 0 | 1.00 | -1293.24 | 561.24 | -2540.00 | -1814.00 | -1102.00 | -762.00 | -673.0 | ▂▂▂▃▇ |
| num_of_donation_years | 0 | 1.00 | 2.50 | 1.49 | 1.00 | 1.00 | 2.00 | 4.00 | 5.0 | ▇▅▃▂▃ |
# Maybe it's a good idea to take out all the NAs for age. Obviously we lose a
# lot of rows, but the ~251,000 left still seem plenty.
customer_segmentation_complete <- customer_segmentation_first_prepro %>% drop_na(year_born)
customer_segmentation_complete
# Christmas-donor status, split by gender
ggplot(customer_segmentation_first_prepro, aes(XMAS_donor)) +
geom_bar() +
facet_wrap(~Gender)
# Number of active donation years, split by generation
ggplot(customer_segmentation_first_prepro, aes(num_of_donation_years)) +
geom_bar() +
facet_wrap(~generation_moniker)
# Age distribution at the time of the last donation
ggplot(customer_segmentation_first_prepro %>% drop_na(age_at_last_donation), aes(age_at_last_donation)) +
geom_histogram(binwidth = 5)
# Total donation sums (truncated below 5000 to hide extreme outliers), by gender
ggplot(customer_segmentation_first_prepro %>% filter(SUMtotal > 0 & SUMtotal < 5000), aes(x = SUMtotal)) +
geom_histogram(binwidth = 100) +
facet_wrap(~Gender)
# Month of the last payment, by gender
ggplot(customer_segmentation_first_prepro, aes(LastPaymentMONTH)) +
geom_bar() +
facet_wrap(~Gender)
# Month of the second-to-last payment, by gender
ggplot(customer_segmentation_first_prepro, aes(PenultimatePaymentMONTH)) +
geom_bar() +
facet_wrap(~Gender)
# Total number of donations, truncated at 7 * 6 = 42
ggplot(customer_segmentation_first_prepro %>% filter(COUNTtotal < (7 * 6)), aes(COUNTtotal)) +
geom_histogram(binwidth = 1)
# Days between the last two payments, truncated at roughly five years
ggplot(customer_segmentation_first_prepro %>% drop_na(donation_interval) %>% filter(donation_interval < (360 * 5)), aes(donation_interval)) +
geom_histogram(binwidth = 30)
# Outlier threshold for the next scatter plot: mean + 6 standard deviations
mean_total_sum <- customer_segmentation_first_prepro$SUMtotal %>% mean(na.rm = TRUE)
sd_total_sum <- customer_segmentation_first_prepro$SUMtotal %>% sd(na.rm = TRUE)
# Birth year vs. total donation sum, extreme outliers removed
ggplot(customer_segmentation_first_prepro %>% drop_na(year_born) %>% filter(SUMtotal < (mean_total_sum + sd_total_sum * 6)), aes(year_born, SUMtotal)) +
geom_point(alpha = 1 / 10)
# Population per Austrian federal state, taken from
# https://de.statista.com/statistik/daten/studie/75396/umfrage/entwicklung-der-bevoelkerung-in-oesterreich-nach-bundesland-seit-1996/
# NOTE(review): "pop_carithia" is a typo for Carinthia; kept as-is because the
# name is referenced in the normalization step below.
pop_vienna <- 1921153
pop_lower_austria <- 1691040
pop_upper_austria <- 1495756
pop_styria <- 1247159
pop_tyrol <- 760161
pop_carithia <- 562230
pop_salzburg <- 560643
pop_vorarlberg <- 399164
pop_burgenland <- 296040
# Donor counts per federal state, normalized to donors per 100,000
# inhabitants so states of different sizes are comparable.
# (FIX: count(Bundesland) replaces the verbose
# select() %>% group_by() %>% count() %>% ungroup() chain.)
donors_per_state_per_100_000_inhabitants <- customer_segmentation_first_prepro %>%
  count(Bundesland) %>%
  mutate(
    # scale each raw count by the state's population
    n = case_when(
      Bundesland == "B" ~ n / pop_burgenland * 100000,
      Bundesland == "K" ~ n / pop_carithia * 100000,
      Bundesland == "N" ~ n / pop_lower_austria * 100000,
      Bundesland == "O" ~ n / pop_upper_austria * 100000,
      Bundesland == "Sa" ~ n / pop_salzburg * 100000,
      Bundesland == "St" ~ n / pop_styria * 100000,
      Bundesland == "T" ~ n / pop_tyrol * 100000,
      Bundesland == "V" ~ n / pop_vorarlberg * 100000,
      Bundesland == "W" ~ n / pop_vienna * 100000
    )
  )
# Donors per 100,000 inhabitants, by federal state
ggplot(donors_per_state_per_100_000_inhabitants, aes(Bundesland, n)) +
  geom_col()
# Recency distribution: days since each customer's last payment
ggplot(customer_segmentation_first_prepro, aes(days_since_last_payment)) +
geom_histogram(binwidth = 30)
RFM segments customers according to three variables: Recency, Frequency, and Monetary Value. Using the rfm package, RFM scores can be computed either on raw transaction data (one row per transaction) or on aggregated customer data (one row per customer). For the former, the function rfm_table_order can be used; for the latter, either rfm_table_customer or rfm_table_customer_2. Since our dataset represents aggregated customer data, the latter should be used. It can be computed directly from the raw data:
library(rfm)
# Compute RFM (Recency / Frequency / Monetary) scores. Since our data is
# aggregated per customer (one row each), rfm_table_customer_2() is used: it
# takes a transaction count, the latest transaction date and the total
# revenue per customer, scored relative to the reference date.
rfm_scores <- customer_segmentation_raw %>%
# create new variables: total donation sum; total number of donations
mutate(SUMtotal = SUM2015 + SUM2016 + SUM2017 + SUM2018 + SUM2019,
COUNTtotal = COUNT2015 + COUNT2016 + COUNT2017 + COUNT2018 + COUNT2019,
LastPaymentDate = as.Date(LastPaymentDate)) %>%
# compute RFM scores
rfm_table_customer_2(customer_id = ID,
n_transactions = COUNTtotal,
latest_visit_date = LastPaymentDate,
total_revenue = SUMtotal,
analysis_date = reference_date)
rfm_scores
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 1 has 1 row to replace 0 rows
## Warning in `[<-.data.frame`(`*tmp*`, is_list, value = list(`1` =
## "<tibble[,8]>", : replacement element 2 has 1 row to replace 0 rows
Visual inspection of RFM scores:
We can see that higher monetary values are characterized by higher donation frequencies and more recent donations. There is an obvious cluster of low monetary value for frequency values in [1,2] and recency in [1,3]. These might be lost donors, i.e. customers who donated only a few times, are obviously no longer active, and are thus unlikely to donate again in the future. In the upper left corner, we see very recent customers with low frequency (i.e. new donors) who donated sums above average for this recency score. It might be worth focusing on them, since they showed above-average donation willingness among low-frequency donors; this segment may be called "prospects". Note: the higher the recency score, the more recent the last transaction!
rfm_heatmap(rfm_scores)
The segmentation of customers is as follows:
# Standard RFM segment names; the three pairs of lower/upper vectors below
# define, position by position, the inclusive score ranges for each segment.
segment_names <- c("Champions", "Loyal Customers", "Potential Loyalist",
"New Customers", "Promising", "Need Attention", "About To Sleep",
"At Risk", "Can't Lose Them", "Lost")
# We set the upper and lower bounds for recency, frequency, and monetary for the above segments
recency_lower <- c(4, 2, 3, 4, 3, 2, 2, 1, 1, 1)
recency_upper <- c(5, 5, 5, 5, 4, 3, 3, 2, 1, 2)
frequency_lower <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
frequency_upper <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
monetary_lower <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
monetary_upper <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
# We use the segments and the bounds we previously established to group our
# users into different segments
segment <- rfm_segment(rfm_scores,
segment_names,
recency_lower,
recency_upper,
frequency_lower,
frequency_upper,
monetary_lower,
monetary_upper)
# Distribution of customers over the predefined segments
segment %>% ggplot(aes(segment)) +
geom_bar()
# Segment shares as rounded proportions, largest first
segment$segment %>% table() %>% prop.table() %>% round(3) %>% sort(decreasing = T)
## .
## Loyal Customers Champions Potential Loyalist At Risk
## 0.241 0.211 0.185 0.107
## Lost About To Sleep Others Need Attention
## 0.085 0.080 0.063 0.027
Alternatively, I defined my own six segments manually, based on the heatmap. However, the resulting segmentation is not very useful due to the large share of customers falling into "Other" (and "Lost"):
# Manual six-segment classification based on the RFM heatmap.
# (FIX: case_when() replaces the previous seven-level nested ifelse() chain;
# conditions are evaluated top to bottom, first match wins, and anything
# unmatched falls through to "Other". Conditions and labels are unchanged.)
rfm_segments <- rfm_scores$rfm %>%
  mutate(segment = case_when(
    # recent, frequent, high-value donors
    recency_score %in% 4:5 & frequency_score %in% 4:5 & monetary_score %in% 4:5 ~ "Champ",
    recency_score %in% 4:5 & frequency_score %in% 2:3 & monetary_score %in% 1:3 ~ "Regular avg active",
    # brand-new but high-value donors
    recency_score %in% 5 & frequency_score %in% 1 & monetary_score %in% 4:5 ~ "Prospect",
    recency_score %in% 4 & frequency_score %in% 1 & monetary_score %in% 1:3 ~ "Newbie",
    # high-value donors going inactive
    recency_score %in% 1:3 & frequency_score %in% 5 & monetary_score %in% 4:5 ~ "Don't loose",
    recency_score %in% 1:3 & frequency_score %in% 3:4 & monetary_score %in% 3:4 ~ "Regular avg at risk",
    recency_score %in% 1:3 & frequency_score %in% 1:2 & monetary_score %in% 1:2 ~ "Lost",
    TRUE ~ "Other"
  ))
# Distribution of the manually defined segments
rfm_segments %>% ggplot(aes(segment)) +
geom_bar()
# Segment shares as rounded proportions, largest first
rfm_segments$segment %>% table() %>% prop.table() %>% round(3) %>% sort(decreasing = T)
## .
## Lost Other Champ Regular avg at risk
## 0.267 0.250 0.211 0.133
## Regular avg active Don't loose Newbie Prospect
## 0.068 0.048 0.021 0.002
Inspect behavior by segment:
# Median recency / frequency / monetary value per predefined segment
rfm_plot_median_recency(segment)
rfm_plot_median_frequency(segment)
rfm_plot_median_monetary(segment)